# -*- coding:utf-8 -*-
'''
@author: xiaoming
@contact: lishihui0129@163.com
@time: 2017/6/28 9:55
@desc:
'''

import re
import os
from myUtil import urlPattern,flagSplitPattern,domainSuffix
from mysqlHelper import insertSql,close,connectDB


def urlExtract(string):
    '''
    Extract a URL fragment from the given text.

    :param string: text that may contain a URL
    :return: list with at most one element -- the substring of *string*
             starting at the first occurrence of 'http' (preferred) or
             'www'; an empty list when no known domain suffix is present
             or neither marker occurs in the text
    '''
    temp = []

    # Only attempt extraction when the text matches a known domain suffix
    # (domainSuffix is a regex imported from myUtil).
    if re.findall(domainSuffix, string):
        if 'http' in string:
            temp.append(string[string.index('http'):])
        elif 'www' in string:
            temp.append(string[string.index('www'):])
    return temp
def prefixFlag(string):
    '''
    Extract the label text that immediately precedes a URL marker keyword.

    :param string: text that may contain a marker such as '网址' or 'web'
    :return: the last field (after splitting on flagSplitPattern) of the
             text before the first marker found, stripped of whitespace;
             empty string when no marker is present
    '''
    linshi = string.lower()

    # Markers checked in priority order; the first one present wins
    # (identical semantics to the original if/elif chain). When none is
    # found ind stays 0, so the prefix slice is empty.
    ind = 0
    for marker in ('网址', '网站', '官网', '链接', 'web'):
        if marker in linshi:
            ind = linshi.index(marker)
            break

    temp = string[:ind]
    res = re.split(flagSplitPattern, temp)
    return res[-1].strip()



def insertUrlTable(string):
    '''
    Find every URL in the given text and persist each one, paired with
    its prefix label, into the `url` table.

    :param string: text to scan for URLs
    '''
    # De-duplicate the matches before inserting.
    urls = list(set(re.findall(urlPattern, string)))
    label = prefixFlag(string)

    connectDB()
    sql = "insert into url(url_value,prefix) values(%s,%s)"
    for url in urls:
        insertSql(sql, (url, label))
    close()

if __name__=='__main__':
    workdir='.\output'
    fileList=os.listdir(workdir)
    if 'url.txt' in fileList:
        targetFile=os.path.join(workdir,'url.txt')
        res=open(targetFile).readlines()
        for line in res:
            # eachline=line.replace(" ","").replace("　","")
            if "http" in line or 'www' in line:
                print urlExtract(line)
                # insertUrlTable(line)
            else:
                continue
    else:
        pass
    '''
    for each in os.listdir(workdir):
        print each
        print os.path.join(workdir,each)

    worddir='./testData'
    for each in os.listdir(worddir):
        # temp=each.decode('gbk').encode('utf-8')
        input='./Txtoutput/'+Word2Txt.getMd5(each)+".txt"
        res=open(input).readlines()

        for line in res:
            eachline=line.replace(" ","").replace("　","")
            if isUrl(eachline) and ("http" in eachline or 'www' in eachline):
                print urlExtract(line)
            else:
                continue
    '''